bitkeeper revision 1.1236.1.154 (4249c430s6iKHaP4AAIWnJQScN1CyA)

author kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>

Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)

committer kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>

Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)
author kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)
committer kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)
diff --git a/xen/arch/ia64/xenmisc.c b/xen/arch/ia64/xenmisc.c

index 40055983db67e3c56d5371b2729e0d7095f193aa..2e4b4366585ebb8101d0b7cfc3c2fe1fcc05d76b 100644 (file)
--- a/xen/arch/ia64/xenmisc.c
+++ b/xen/arch/ia64/xenmisc.c
@@ -53,7 +53,7 @@ platform_is_hp_ski(void)
  }
  
  /* calls in xen/common code that are unused on ia64 */
-void synchronise_pagetables(unsigned long cpu_mask) { return; }
+void synchronise_execution_state(unsigned long cpu_mask) { }
  
  int grant_table_create(struct domain *d) { return 0; }
  void grant_table_destroy(struct domain *d)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c

index dbebab53d6d2cc3ad23eb589d9920a610573a6a8..e26a509f2af284dfc812411efb137a7e7fc23487 100644 (file)
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -45,13 +45,18 @@
  static int opt_noreboot = 0;
  boolean_param("noreboot", opt_noreboot);
  
+struct percpu_ctxt {
+    struct exec_domain *curr_ed;
+} __cacheline_aligned;
+static struct percpu_ctxt percpu_ctxt[NR_CPUS];
+
  static void default_idle(void)
  {
-    __cli();
+    local_irq_disable();
      if ( !softirq_pending(smp_processor_id()) )
          safe_halt();
      else
-        __sti();
+        local_irq_enable();
  }
  
  static __attribute_used__ void idle_loop(void)
@@ -73,6 +78,8 @@ void startup_cpu_idle_loop(void)
  {
      /* Just some sanity to ensure that the scheduler is set up okay. */
      ASSERT(current->domain->id == IDLE_DOMAIN_ID);
+    percpu_ctxt[smp_processor_id()].curr_ed = current;
+    set_bit(smp_processor_id(), &current->domain->cpuset);
      domain_unpause_by_systemcontroller(current->domain);
      raise_softirq(SCHEDULE_SOFTIRQ);
      do_softirq();
@@ -110,7 +117,7 @@ void machine_restart(char * __unused)
              safe_halt();
      }
  
-    __sti();
+    local_irq_enable();
  
      /* Ensure we are the boot CPU. */
      if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
@@ -307,10 +314,10 @@ unsigned long alloc_monitor_pagetable(struct exec_domain *ed)
      struct pfn_info *mmfn_info;
      struct domain *d = ed->domain;
  
-    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
+    ASSERT(pagetable_val(ed->arch.monitor_table) == 0);
  
      mmfn_info = alloc_domheap_page(NULL);
-    ASSERT( mmfn_info ); 
+    ASSERT(mmfn_info != NULL); 
  
      mmfn = (unsigned long) (mmfn_info - frame_table);
      mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
@@ -326,7 +333,7 @@ unsigned long alloc_monitor_pagetable(struct exec_domain *ed)
  
      ed->arch.monitor_vtable = mpl2e;
  
-    // map the phys_to_machine map into the Read-Only MPT space for this domain
+    /* Map the p2m map into the Read-Only MPT space for this domain. */
      mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
          mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
  
@@ -578,19 +585,10 @@ void toggle_guest_mode(struct exec_domain *ed)
          : "=r" (__r) : "r" (value), "0" (__r) );\
      __r; })
  
-static void switch_segments(
-    struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n)
+static void load_segments(struct exec_domain *p, struct exec_domain *n)
  {
      int all_segs_okay = 1;
  
-    if ( !is_idle_task(p->domain) )
-    {
-        __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
-        __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
-        __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
-        __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
-    }
-
      /* Either selector != 0 ==> reload. */
      if ( unlikely(p->arch.user_ctxt.ds |
                    n->arch.user_ctxt.ds) )
@@ -654,7 +652,8 @@ static void switch_segments(
  
      if ( unlikely(!all_segs_okay) )
      {
-        unsigned long *rsp =
+        struct xen_regs *regs = get_execution_context();
+        unsigned long   *rsp =
              (n->arch.flags & TF_kernel_mode) ?
              (unsigned long *)regs->rsp : 
              (unsigned long *)n->arch.kernel_sp;
@@ -689,6 +688,24 @@ static void switch_segments(
      }
  }
  
+static void save_segments(struct exec_domain *p)
+{
+    __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
+    __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
+    __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
+    __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
+}
+
+static void clear_segments(void)
+{
+    __asm__ __volatile__ (
+        "movl %0,%%ds; "
+        "movl %0,%%es; "
+        "movl %0,%%fs; "
+        "movl %0,%%gs; swapgs; movl %0,%%gs"
+        : : "r" (0) );
+}
+
  long do_switch_to_user(void)
  {
      struct xen_regs       *regs = get_execution_context();
@@ -720,80 +737,96 @@ long do_switch_to_user(void)
  
  #elif defined(__i386__)
  
-#define switch_segments(_r, _p, _n) ((void)0)
+#define load_segments(_p, _n) ((void)0)
+#define save_segments(_p)     ((void)0)
+#define clear_segments()      ((void)0)
  
  #endif
  
-/*
- * This special macro can be used to load a debugging register
- */
  #define loaddebug(_ed,_reg) \
-               __asm__("mov %0,%%db" #_reg  \
-                       : /* no output */ \
-                       :"r" ((_ed)->debugreg[_reg]))
+       __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg]))
  
-void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p)
+static void __context_switch(void)
  {
-#ifdef __i386__
-    struct tss_struct *tss = init_tss + smp_processor_id();
-#endif
      execution_context_t *stack_ec = get_execution_context();
+    unsigned int         cpu = smp_processor_id();
+    struct exec_domain  *p = percpu_ctxt[cpu].curr_ed;
+    struct exec_domain  *n = current;
  
-    __cli();
-
-    /* Switch guest general-register state. */
-    if ( !is_idle_task(prev_p->domain) )
+    if ( !is_idle_task(p->domain) )
      {
-        memcpy(&prev_p->arch.user_ctxt,
+        memcpy(&p->arch.user_ctxt,
                 stack_ec, 
                 sizeof(*stack_ec));
-        unlazy_fpu(prev_p);
-        CLEAR_FAST_TRAP(&prev_p->arch);
+        unlazy_fpu(p);
+        CLEAR_FAST_TRAP(&p->arch);
+        save_segments(p);
      }
  
-    if ( !is_idle_task(next_p->domain) )
-    {
-        memcpy(stack_ec,
-               &next_p->arch.user_ctxt,
-               sizeof(*stack_ec));
+    memcpy(stack_ec,
+           &n->arch.user_ctxt,
+           sizeof(*stack_ec));
  
-        /* Maybe switch the debug registers. */
-        if ( unlikely(next_p->arch.debugreg[7]) )
-        {
-            loaddebug(&next_p->arch, 0);
-            loaddebug(&next_p->arch, 1);
-            loaddebug(&next_p->arch, 2);
-            loaddebug(&next_p->arch, 3);
-            /* no 4 and 5 */
-            loaddebug(&next_p->arch, 6);
-            loaddebug(&next_p->arch, 7);
-        }
+    /* Maybe switch the debug registers. */
+    if ( unlikely(n->arch.debugreg[7]) )
+    {
+        loaddebug(&n->arch, 0);
+        loaddebug(&n->arch, 1);
+        loaddebug(&n->arch, 2);
+        loaddebug(&n->arch, 3);
+        /* no 4 and 5 */
+        loaddebug(&n->arch, 6);
+        loaddebug(&n->arch, 7);
+    }
  
-        if ( !VMX_DOMAIN(next_p) )
-        {
-            SET_FAST_TRAP(&next_p->arch);
+    if ( !VMX_DOMAIN(n) )
+    {
+        SET_FAST_TRAP(&n->arch);
  
  #ifdef __i386__
+        {
              /* Switch the kernel ring-1 stack. */
-            tss->esp1 = next_p->arch.kernel_sp;
-            tss->ss1  = next_p->arch.kernel_ss;
-#endif
+            struct tss_struct *tss = &init_tss[cpu];
+            tss->esp1 = n->arch.kernel_sp;
+            tss->ss1  = n->arch.kernel_ss;
          }
-
-        /* Switch page tables. */
-        write_ptbase(next_p);
+#endif
      }
  
-    set_current(next_p);
+    set_bit(cpu, &n->domain->cpuset);
+    write_ptbase(n);
+    clear_bit(cpu, &p->domain->cpuset);
  
-    __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->arch.gdt));
+    __asm__ __volatile__ ( "lgdt %0" : "=m" (*n->arch.gdt) );
+
+    percpu_ctxt[cpu].curr_ed = n;
+}
  
-    __sti();
  
-    if ( !VMX_DOMAIN(next_p) )
+void context_switch(struct exec_domain *prev, struct exec_domain *next)
+{
+    struct exec_domain *realprev;
+
+    local_irq_disable();
+
+    set_current(next);
+
+    if ( ((realprev = percpu_ctxt[smp_processor_id()]. curr_ed) == next) || 
+         is_idle_task(next->domain) )
      {
-        load_LDT(next_p);
-        switch_segments(stack_ec, prev_p, next_p);
+        local_irq_enable();
+    }
+    else
+    {
+        __context_switch();
+
+        local_irq_enable();
+        
+        if ( !VMX_DOMAIN(next) )
+        {
+            load_LDT(next);
+            load_segments(realprev, next);
+        }
      }
  
      /*
@@ -802,13 +835,27 @@ void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p)
       * 'prev' (after this point, a dying domain's info structure may be freed
       * without warning). 
       */
-    clear_bit(EDF_RUNNING, &prev_p->ed_flags);
+    clear_bit(EDF_RUNNING, &prev->ed_flags);
  
-    schedule_tail(next_p);
+    schedule_tail(next);
  
      BUG();
  }
  
+static void __synchronise_lazy_execstate(void *unused)
+{
+    if ( percpu_ctxt[smp_processor_id()].curr_ed != current )
+    {
+        __context_switch();
+        load_LDT(current);
+        clear_segments();
+    }
+}
+void synchronise_lazy_execstate(unsigned long cpuset)
+{
+    smp_subset_call_function(__synchronise_lazy_execstate, NULL, 1, cpuset);
+}
+
  unsigned long __hypercall_create_continuation(
      unsigned int op, unsigned int nr_args, ...)
  {
@@ -947,13 +994,11 @@ void domain_relinquish_memory(struct domain *d)
  {
      struct exec_domain *ed;
  
-    /* Ensure that noone is running over the dead domain's page tables. */
-    synchronise_pagetables(~0UL);
+    BUG_ON(d->cpuset != 0);
  
      /* Release device mappings of other domains */
      gnttab_release_dev_mappings( d->grant_table );
  
-
      /* Exit shadow mode before deconstructing final guest page table. */
      shadow_mode_disable(d);
  
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c

index 54e5caa7e0388dc5dabc9cb7acd6f26e0aa001cb..3ad344ce09193d6a45af582f4f2ec01bc1e9164b 100644 (file)
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -421,7 +421,7 @@ int construct_dom0(struct domain *d,
      update_pagetables(ed);
  
      /* Install the new page tables. */
-    __cli();
+    local_irq_disable();
      write_ptbase(ed);
  
      /* Copy the OS image and free temporary buffer. */
@@ -498,7 +498,7 @@ int construct_dom0(struct domain *d,
  
      /* Reinstate the caller's page tables. */
      write_ptbase(current);
-    __sti();
+    local_irq_enable();
  
  #if defined(__i386__)
      /* Destroy low mappings - they were only for our convenience. */
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c

index 0c8322bfc929ea92070b8ab9c478e296ae6613e4..c76dd791bc32ed2e762712368f04515fead2043e 100644 (file)
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1147,16 +1147,13 @@ int get_page_type(struct pfn_info *page, u32 type)
                   * may be unnecessary (e.g., page was GDT/LDT) but those
                   * circumstances should be very rare.
                   */
-                struct exec_domain *ed;
-                unsigned long mask = 0;
-                for_each_exec_domain ( page_get_owner(page), ed )
-                    mask |= 1 << ed->processor;
-                mask = tlbflush_filter_cpuset(mask, page->tlbflush_timestamp);
+                unsigned long cpuset = tlbflush_filter_cpuset(
+                    page_get_owner(page)->cpuset, page->tlbflush_timestamp);
  
-                if ( unlikely(mask != 0) )
+                if ( unlikely(cpuset != 0) )
                  {
                      perfc_incrc(need_flush_tlb_flush);
-                    flush_tlb_mask(mask);
+                    flush_tlb_mask(cpuset);
                  }
  
                  /* We lose existing type, back pointer, and validity. */
@@ -2842,7 +2839,7 @@ void audit_domain(struct domain *d)
  
      if ( d != current->domain )
          domain_pause(d);
-    synchronise_pagetables(~0UL);
+    synchronise_lazy_execstate(~0UL);
  
      printk("pt base=%lx sh_info=%x\n",
             pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c

index 4e755d059777671217070764a07c58339365e6f7..e47eccf8eeb521c94cb5704256651247ae6b2474 100644 (file)
--- a/xen/arch/x86/shadow.c
+++ b/xen/arch/x86/shadow.c
@@ -384,7 +384,6 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
      }   
  
      domain_pause(d);
-    synchronise_pagetables(~0UL);
  
      shadow_lock(d);
  
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c

index fc7dfc42b5eba8c44b4f35210d2ac736790bca1b..962417238fc6eee0673edca89cf653290c3731aa 100644 (file)
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -59,9 +59,7 @@
   */
  
  /*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ * The following functions deal with sending IPIs between CPUs.
   */
  
  static inline int __prepare_ICR (unsigned int shortcut, int vector)
@@ -82,22 +80,22 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
       * of the value read we use an atomic rmw access to avoid costly
       * cli/sti.  Otherwise we use an even cheaper single atomic write
       * to the APIC.
-        */
+     */
      unsigned int cfg;
  
      /*
-        * Wait for idle.
-        */
+     * Wait for idle.
+     */
      apic_wait_icr_idle();
  
      /*
-        * No need to touch the target chip field
-        */
+     * No need to touch the target chip field
+     */
      cfg = __prepare_ICR(shortcut, vector);
  
      /*
-        * Send the IPI. The write to APIC_ICR fires this off.
-        */
+     * Send the IPI. The write to APIC_ICR fires this off.
+     */
      apic_write_around(APIC_ICR, cfg);
  }
  
@@ -111,106 +109,44 @@ static inline void send_IPI_mask(int mask, int vector)
      unsigned long cfg;
      unsigned long flags;
  
-    __save_flags(flags);
-    __cli();
+    local_irq_save(flags);
  
-               
      /*
       * Wait for idle.
       */
      apic_wait_icr_idle();
-               
+
      /*
       * prepare target chip field
       */
      cfg = __prepare_ICR2(mask);
      apic_write_around(APIC_ICR2, cfg);
-               
+
      /*
       * program the ICR 
       */
      cfg = __prepare_ICR(0, vector);
-                       
+
      /*
       * Send the IPI. The write to APIC_ICR fires this off.
       */
      apic_write_around(APIC_ICR, cfg);
  
-    __restore_flags(flags);
+    local_irq_restore(flags);
  }
  
  static inline void send_IPI_allbutself(int vector)
  {
      /*
-     * if there are no other CPUs in the system then
-     * we get an APIC send error if we try to broadcast.
-     * thus we have to avoid sending IPIs in this case.
+     * If there are no other CPUs in the system then we get an APIC send error 
+     * if we try to broadcast. Thus we have to avoid sending IPIs in this case.
       */
-    if (!(smp_num_cpus > 1))
+    if ( smp_num_cpus <= 1 )
          return;
  
      __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
  }
  
-/*
- * ********* XEN NOTICE **********
- * I've left the following comments lying around as they look liek they might
- * be useful to get multiprocessor guest OSes going. However, I suspect the
- * issues we face will be quite different so I've ripped out all the
- * TLBSTATE logic (I didn't understand it anyway :-). These comments do
- * not apply to Xen, therefore! -- Keir (8th Oct 2003).
- */
-/*
- *     Smarter SMP flushing macros. 
- *             c/o Linus Torvalds.
- *
- *     These mean you can really definitely utterly forget about
- *     writing to user space from interrupts. (Its not allowed anyway).
- *
- *     Optimizations Manfred Spraul <manfred@colorfullife.com>
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) clear_bit(cpu, &old_mm.cpu_vm_mask);
- *     Stop ipi delivery for the old mm. This is not synchronized with
- *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *     for the wrong mm, and in the worst case we perform a superflous
- *     tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *     was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- *     Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) set_bit(cpu, &new_mm.cpu_vm_mask);
- *     Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
- *     flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *     Atomically set the bit [other cpus will start sending flush ipis],
- *     and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- *
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
  static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED;
  static unsigned long flush_cpumask;
  
@@ -226,21 +162,19 @@ void flush_tlb_mask(unsigned long mask)
  {
      ASSERT(local_irq_is_enabled());
      
-    if ( mask & (1 << smp_processor_id()) )
+    if ( mask & (1UL << smp_processor_id()) )
      {
          local_flush_tlb();
-        mask &= ~(1 << smp_processor_id());
+        mask &= ~(1UL << smp_processor_id());
      }
  
      if ( mask != 0 )
      {
          spin_lock(&flush_lock);
-
          flush_cpumask = mask;
          send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
          while ( flush_cpumask != 0 )
              cpu_relax();
-
          spin_unlock(&flush_lock);
      }
  }
@@ -254,7 +188,8 @@ void new_tlbflush_clock_period(void)
      if ( smp_num_cpus > 1 )
      {
          spin_lock(&flush_lock);
-        flush_cpumask = ((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id());
+        flush_cpumask  = (1UL << smp_num_cpus) - 1;
+        flush_cpumask &= ~(1UL << smp_processor_id());
          send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
          while ( flush_cpumask != 0 )
              cpu_relax();
@@ -266,124 +201,138 @@ void new_tlbflush_clock_period(void)
      tlbflush_clock++;
  }
  
-static void flush_tlb_all_pge_ipi(void* info)
+static void flush_tlb_all_pge_ipi(void *info)
  {
      __flush_tlb_pge();
  }
  
  void flush_tlb_all_pge(void)
  {
-    smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
+    smp_call_function(flush_tlb_all_pge_ipi, 0, 1, 1);
      __flush_tlb_pge();
  }
  
  void smp_send_event_check_mask(unsigned long cpu_mask)
  {
-    cpu_mask &= ~(1<<smp_processor_id());
+    cpu_mask &= ~(1UL << smp_processor_id());
      if ( cpu_mask != 0 )
          send_IPI_mask(cpu_mask, EVENT_CHECK_VECTOR);
  }
  
  /*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
+ * Structure and data for smp_call_function().
   */
-static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
  
  struct call_data_struct {
      void (*func) (void *info);
      void *info;
-    atomic_t started;
-    atomic_t finished;
+    unsigned long started;
+    unsigned long finished;
      int wait;
  };
  
-static struct call_data_struct * call_data;
+static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
+static struct call_data_struct *call_data;
  
  /*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-                       int wait)
-/*
- * [SUMMARY] Run a function on all other CPUs.
- * <func> The function to run. This must be fast and non-blocking.
- * <info> An arbitrary pointer to pass to the function.
- * <nonatomic> currently unused.
- * <wait> If true, wait (atomically) until function has completed on other CPUs.
- * [RETURNS] 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler, or bottom halfs.
+ * Run a function on all other CPUs.
+ *  @func: The function to run. This must be fast and non-blocking.
+ *  @info: An arbitrary pointer to pass to the function.
+ *  @wait: If true, spin until function has completed on other CPUs.
+ *  Returns: 0 on success, else a negative status code.
   */
+int smp_call_function(
+    void (*func) (void *info), void *info, int unused, int wait)
  {
      struct call_data_struct data;
-    int cpus = smp_num_cpus-1;
+    unsigned long cpuset;
+
+    ASSERT(local_irq_is_enabled());
  
-    if (!cpus)
+    cpuset = ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id());
+    if ( cpuset == 0 )
          return 0;
  
      data.func = func;
      data.info = info;
-    atomic_set(&data.started, 0);
+    data.started = data.finished = 0;
      data.wait = wait;
-    if (wait)
-        atomic_set(&data.finished, 0);
+
+    spin_lock(&call_lock);
+
+    call_data = &data;
+    wmb();
+
+    send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+    while ( (wait ? data.finished : data.started) != cpuset )
+        cpu_relax();
+
+    spin_unlock(&call_lock);
+
+    return 0;
+}
+
+/* Run a function on a subset of CPUs (may include local CPU). */
+int smp_subset_call_function(
+    void (*func) (void *info), void *info, int wait, unsigned long cpuset)
+{
+    struct call_data_struct data;
  
      ASSERT(local_irq_is_enabled());
  
+    if ( cpuset & (1UL << smp_processor_id()) )
+    {
+        local_irq_disable();
+        (*func)(info);
+        local_irq_enable();
+    }
+
+    cpuset &= ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id());
+    if ( cpuset == 0 )
+        return 0;
+
+    data.func = func;
+    data.info = info;
+    data.started = data.finished = 0;
+    data.wait = wait;
+
      spin_lock(&call_lock);
  
      call_data = &data;
      wmb();
-    /* Send a message to all other CPUs and wait for them to respond */
-    send_IPI_allbutself(CALL_FUNCTION_VECTOR);
  
-    /* Wait for response */
-    while (atomic_read(&data.started) != cpus)
-        barrier();
+    send_IPI_mask(cpuset, CALL_FUNCTION_VECTOR);
  
-    if (wait)
-        while (atomic_read(&data.finished) != cpus)
-            barrier();
+    while ( (wait ? data.finished : data.started) != cpuset )
+        cpu_relax();
  
      spin_unlock(&call_lock);
  
      return 0;
  }
  
-static void stop_this_cpu (void * dummy)
+static void stop_this_cpu (void *dummy)
  {
-    /*
-     * Remove this CPU:
-     */
      clear_bit(smp_processor_id(), &cpu_online_map);
-    __cli();
+
      disable_local_APIC();
-    for(;;) __asm__("hlt");
-}
  
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
+    for ( ; ; )
+        __asm__ __volatile__ ( "hlt" );
+}
  
  void smp_send_stop(void)
  {
+    /* Stop all other CPUs in the system. */
      smp_call_function(stop_this_cpu, NULL, 1, 0);
      smp_num_cpus = 1;
  
-    __cli();
+    local_irq_disable();
      disable_local_APIC();
-    __sti();
+    local_irq_enable();
  }
  
-/*
- * Nothing to do, as all the work is done automatically when
- * we return from the interrupt.
- */
  asmlinkage void smp_event_check_interrupt(void)
  {
      ack_APIC_irq();
@@ -394,23 +343,20 @@ asmlinkage void smp_call_function_interrupt(void)
  {
      void (*func) (void *info) = call_data->func;
      void *info = call_data->info;
-    int wait = call_data->wait;
  
      ack_APIC_irq();
      perfc_incrc(ipis);
  
-    /*
-     * Notify initiating CPU that I've grabbed the data and am
-     * about to execute the function
-     */
-    mb();
-    atomic_inc(&call_data->started);
-    /*
-     * At this point the info structure may be out of scope unless wait==1
-     */
-    (*func)(info);
-    if (wait) {
+    if ( call_data->wait )
+    {
+        (*func)(info);
+        mb();
+        set_bit(smp_processor_id(), &call_data->finished);
+    }
+    else
+    {
          mb();
-        atomic_inc(&call_data->finished);
+        set_bit(smp_processor_id(), &call_data->started);
+        (*func)(info);
      }
  }
diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c

index 8c8897a2831dae34bbde10b9820c0b36b08eb580..ac1f10def532098fbf25cead274530874cef80d8 100644 (file)
--- a/xen/arch/x86/x86_32/mm.c
+++ b/xen/arch/x86/x86_32/mm.c
@@ -180,22 +180,6 @@ void subarch_init_memory(struct domain *dom_xen)
      }
  }
  
-/*
- * Allows shooting down of borrowed page-table use on specific CPUs.
- * Specifically, we borrow page tables when running the idle domain.
- */
-static void __synchronise_pagetables(void *mask)
-{
-    struct exec_domain *ed = current;
-    if ( ((unsigned long)mask & (1 << ed->processor)) &&
-         is_idle_task(ed->domain) )
-        write_ptbase(ed);
-}
-void synchronise_pagetables(unsigned long cpu_mask)
-{
-    __synchronise_pagetables((void *)cpu_mask);
-    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
-}
  
  long do_stack_switch(unsigned long ss, unsigned long esp)
  {
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c

index d0717bfab73d2ec1fc0b2ebbddca2dbddd87b77f..d5e925492507b6f09d0e53f8aa665f1df42fca55 100644 (file)
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -236,23 +236,6 @@ void subarch_init_memory(struct domain *dom_xen)
      }
  }
  
-/*
- * Allows shooting down of borrowed page-table use on specific CPUs.
- * Specifically, we borrow page tables when running the idle domain.
- */
-static void __synchronise_pagetables(void *mask)
-{
-    struct exec_domain *ed = current;
-    if ( ((unsigned long)mask & (1 << ed->processor)) &&
-         is_idle_task(ed->domain) )
-        write_ptbase(ed);
-}
-void synchronise_pagetables(unsigned long cpu_mask)
-{
-    __synchronise_pagetables((void *)cpu_mask);
-    smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1);
-}
-
  long do_stack_switch(unsigned long ss, unsigned long esp)
  {
      if ( (ss & 3) != 3 )
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c

index d20a851e5dc915b24ac49ff900bfe022c6e9cf3c..2c9880d53d0008da66dcfe56bac15c973313731e 100644 (file)
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -266,7 +266,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
          else
          {
              exec_domain_pause(ed);
-            synchronise_pagetables(~0UL);
              if ( ed->processor != (cpu % smp_num_cpus) )
                  set_bit(EDF_MIGRATED, &ed->ed_flags);
              set_bit(EDF_CPUPINNED, &ed->ed_flags);
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c

index 4cae1d2a6373f5dd6634dc495146d625b3751d2b..39f50983c2766da20f9015d913576b2f0e6c04e3 100644 (file)
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -534,8 +534,6 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order)
  {
      int            i, drop_dom_ref;
      struct domain *d = page_get_owner(pg);
-    struct exec_domain *ed;
-    int cpu_mask = 0;
  
      ASSERT(!in_irq());
  
@@ -557,14 +555,11 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order)
          /* NB. May recursively lock from domain_relinquish_memory(). */
          spin_lock_recursive(&d->page_alloc_lock);
  
-        for_each_exec_domain ( d, ed )
-            cpu_mask |= 1 << ed->processor;
-
          for ( i = 0; i < (1 << order); i++ )
          {
              ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
              pg[i].tlbflush_timestamp  = tlbflush_current_time();
-            pg[i].u.free.cpu_mask     = cpu_mask;
+            pg[i].u.free.cpu_mask     = d->cpuset;
              list_del(&pg[i].list);
          }
  
diff --git a/xen/common/schedule.c b/xen/common/schedule.c

index 1ce8fa22d5a038b2a7bda3ef57241204a2565356..dddb96268addfd2a55d236a017ab1cdae2511d7d 100644 (file)
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -192,7 +192,6 @@ void sched_add_domain(struct exec_domain *ed)
  
  void sched_rem_domain(struct exec_domain *ed) 
  {
-
      rem_ac_timer(&ed->timer);
      SCHED_OP(rem_task, ed);
      TRACE_3D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid, ed);
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h

index d2d68e7cc28da8ea0ac6f0f6166f66b01e72a2a7..17357bbcb4cc7116d131c33bd304c96cb22e56ef 100644 (file)
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -205,12 +205,6 @@ static inline int get_page_and_type(struct pfn_info *page,
  
  int check_descriptor(struct desc_struct *d);
  
-/*
- * Use currently-executing domain's pagetables on the specified CPUs.
- * i.e., stop borrowing someone else's tables if you are the idle domain.
- */
-void synchronise_pagetables(unsigned long cpu_mask);
-
  /*
   * The MPT (machine->physical mapping table) is an array of word-sized
   * values, indexed on machine frame number. It is expected that guest OSes
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h

index bcc5db1d8d9d7f4401c25732c1f02870c6abe59e..145dc82a3208411b7b017721319ffeb45a22c812 100644 (file)
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -124,11 +124,11 @@
   *   ptr[:2]  -- Machine address of new page-table base to install in MMU
   *               when in user space.
   * 
- *   val[7:0] == MMUEXT_TLB_FLUSH:
- *   No additional arguments.
+ *   val[7:0] == MMUEXT_TLB_FLUSH_LOCAL:
+ *   No additional arguments. Flushes local TLB.
   * 
- *   val[7:0] == MMUEXT_INVLPG:
- *   ptr[:2]  -- Linear address to be flushed from the TLB.
+ *   val[7:0] == MMUEXT_INVLPG_LOCAL:
+ *   ptr[:2]  -- Linear address to be flushed from the local TLB.
   * 
   *   val[7:0] == MMUEXT_FLUSH_CACHE:
   *   No additional arguments. Writes back and flushes cache contents.
@@ -154,6 +154,12 @@
   *   val[7:0] == MMUEXT_REASSIGN_PAGE:
   *   ptr[:2]  -- A machine address within the page to be reassigned to the FD.
   *               (NB. page must currently belong to the calling domain).
+ * 
+ *   val[7:0] == MMUEXT_TLB_FLUSH_MULTI:
+ *   Flush TLBs of VCPUs specified in @mask.
+ * 
+ *   val[7:0] == MMUEXT_INVLPG_MULTI:
+ *   ptr[:2]  -- Linear address to be flushed from TLB of VCPUs in @mask.
   */
  #define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
  #define MMU_MACHPHYS_UPDATE      2 /* ptr = MA of frame to modify entry for  */
@@ -164,8 +170,8 @@
  #define MMUEXT_PIN_L4_TABLE      3 /* ptr = MA of frame to pin               */
  #define MMUEXT_UNPIN_TABLE       4 /* ptr = MA of frame to unpin             */
  #define MMUEXT_NEW_BASEPTR       5 /* ptr = MA of new pagetable base         */
-#define MMUEXT_TLB_FLUSH         6 /* ptr = NULL                             */
-#define MMUEXT_INVLPG            7 /* ptr = VA to invalidate                 */
+#define MMUEXT_TLB_FLUSH_LOCAL   6 /* ptr = NULL                             */
+#define MMUEXT_INVLPG_LOCAL      7 /* ptr = VA to invalidate                 */
  #define MMUEXT_FLUSH_CACHE       8
  #define MMUEXT_SET_LDT           9 /* ptr = VA of table; val = # entries     */
  #define MMUEXT_SET_FOREIGNDOM   10 /* val[31:16] = dom                       */
@@ -173,6 +179,8 @@
  #define MMUEXT_TRANSFER_PAGE    12 /* ptr = MA of frame; val[31:16] = dom    */
  #define MMUEXT_REASSIGN_PAGE    13
  #define MMUEXT_NEW_USER_BASEPTR 14
+#define MMUEXT_TLB_FLUSH_MULTI  15 /* ptr = NULL; mask = VCPUs to flush      */
+#define MMUEXT_INVLPG_MULTI     16 /* ptr = VA to inval.; mask = VCPUs       */
  #define MMUEXT_CMD_MASK        255
  #define MMUEXT_CMD_SHIFT         8
  
@@ -180,6 +188,9 @@
  #define UVMF_FLUSH_TLB          1 /* Flush entire TLB. */
  #define UVMF_INVLPG             2 /* Flush the VA mapping being updated. */
  
+/* Backwards source compatibility. */
+#define MMUEXT_TLB_FLUSH        MMUEXT_TLB_FLUSH_LOCAL
+#define MMUEXT_INVLPG           MMUEXT_INVLPG_LOCAL
  
  /*
   * Commands to HYPERVISOR_sched_op().
@@ -257,8 +268,9 @@ typedef u16 domid_t;
   */
  typedef struct
  {
-    memory_t ptr;    /* Machine address of PTE. */
-    memory_t val;    /* New contents of PTE.    */
+    memory_t ptr;       /* Machine address of PTE. */
+    memory_t val;       /* New contents of PTE.    */
+    /*unsigned long mask;*/ /* VCPU mask (certain extended commands). */
  } PACKED mmu_update_t;
  
  /*
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h

index 3098586f17c0cdae10119940db0c5f0ab7d44186..b28a955b0f5e5af0169ac79bc138346050deb154 100644 (file)
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -143,6 +143,9 @@ struct domain
  
      struct exec_domain *exec_domain[MAX_VIRT_CPUS];
  
+    /* Bitmask of CPUs on which this domain is running. */
+    unsigned long cpuset;
+
      struct arch_domain arch;
  };
  
@@ -250,6 +253,12 @@ void init_idle_task(void);
  void domain_wake(struct exec_domain *d);
  void domain_sleep(struct exec_domain *d);
  
+/*
+ * Force loading of currently-executing domain state on the specified set
+ * of CPUs. This is used to counteract lazy state switching where required.
+ */
+void synchronise_lazy_execstate(unsigned long cpuset);
+
  extern void context_switch(
      struct exec_domain *prev, 
      struct exec_domain *next);
@@ -330,14 +339,21 @@ static inline void exec_domain_pause(struct exec_domain *ed)
      ASSERT(ed != current);
      atomic_inc(&ed->pausecnt);
      domain_sleep(ed);
+    synchronise_lazy_execstate(ed->domain->cpuset & (1UL << ed->processor));
  }
  
  static inline void domain_pause(struct domain *d)
  {
      struct exec_domain *ed;
  
-    for_each_exec_domain(d, ed)
-        exec_domain_pause(ed);
+    for_each_exec_domain( d, ed )
+    {
+        ASSERT(ed != current);
+        atomic_inc(&ed->pausecnt);
+        domain_sleep(ed);
+    }
+
+    synchronise_lazy_execstate(d->cpuset);
  }
  
  static inline void exec_domain_unpause(struct exec_domain *ed)
@@ -351,7 +367,7 @@ static inline void domain_unpause(struct domain *d)
  {
      struct exec_domain *ed;
  
-    for_each_exec_domain(d, ed)
+    for_each_exec_domain( d, ed )
          exec_domain_unpause(ed);
  }
  
@@ -361,30 +377,26 @@ static inline void exec_domain_unblock(struct exec_domain *ed)
          domain_wake(ed);
  }
  
-static inline void domain_unblock(struct domain *d)
-{
-    struct exec_domain *ed;
-
-    for_each_exec_domain(d, ed)
-        exec_domain_unblock(ed);
-}
-
  static inline void domain_pause_by_systemcontroller(struct domain *d)
  {
      struct exec_domain *ed;
  
-    for_each_exec_domain(d, ed) {
+    for_each_exec_domain ( d, ed )
+    {
          ASSERT(ed != current);
          if ( !test_and_set_bit(EDF_CTRLPAUSE, &ed->ed_flags) )
              domain_sleep(ed);
      }
+
+    synchronise_lazy_execstate(d->cpuset);
  }
  
  static inline void domain_unpause_by_systemcontroller(struct domain *d)
  {
      struct exec_domain *ed;
  
-    for_each_exec_domain(d, ed) {
+    for_each_exec_domain ( d, ed )
+    {
          if ( test_and_clear_bit(EDF_CTRLPAUSE, &ed->ed_flags) )
              domain_wake(ed);
      }
diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h

index 13e370cdcac8e3144cff31754dda86ae1d0373b9..f3f08127b6410e985cbea5c555e0b637412ddbef 100644 (file)
--- a/xen/include/xen/smp.h
+++ b/xen/include/xen/smp.h
@@ -43,8 +43,10 @@ extern void smp_commence(void);
  /*
   * Call a function on all other processors
   */
-extern int smp_call_function (void (*func) (void *info), void *info,
-                             int retry, int wait);
+extern int smp_call_function(
+    void (*func) (void *info), void *info, int retry, int wait);
+extern int smp_subset_call_function(
+    void (*func) (void *info), void *info, int wait, unsigned long cpuset);
  
  /*
   * True once the per process idle is forked
@@ -84,7 +86,8 @@ extern volatile int smp_msg_id;
  #define kernel_lock()
  #define cpu_logical_map(cpu)                   0
  #define cpu_number_map(cpu)                    0
-#define smp_call_function(func,info,retry,wait)        ({ 0; })
+#define smp_call_function(func,info,retry,wait)        0
+#define smp_subset_call_function(f,i,w,c)      ({ if ( (c) & 1 ) (*(f))(i); 0; }) /* calls f(i) iff bit 0 of c is set; yields 0 */
  #define cpu_online_map                         1
  
  #endif
author	kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
	Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)
committer	kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
	Tue, 29 Mar 2005 21:10:08 +0000 (21:10 +0000)
xen/arch/ia64/xenmisc.c		patch \| blob \| history
xen/arch/x86/domain.c		patch \| blob \| history
xen/arch/x86/domain_build.c		patch \| blob \| history
xen/arch/x86/mm.c		patch \| blob \| history
xen/arch/x86/shadow.c		patch \| blob \| history
xen/arch/x86/smp.c		patch \| blob \| history
xen/arch/x86/x86_32/mm.c		patch \| blob \| history
xen/arch/x86/x86_64/mm.c		patch \| blob \| history
xen/common/dom0_ops.c		patch \| blob \| history
xen/common/page_alloc.c		patch \| blob \| history
xen/common/schedule.c		patch \| blob \| history
xen/include/asm-x86/mm.h		patch \| blob \| history
xen/include/public/xen.h		patch \| blob \| history
xen/include/xen/sched.h		patch \| blob \| history
xen/include/xen/smp.h		patch \| blob \| history